In [7]:
import pandas as pd
import numpy
import json
from collections import defaultdict
from matplotlib.pylab import style
style.use('fivethirtyeight')
%pylab inline
# Sentinel used by the upstream extraction for "no value" (Java Integer.MIN_VALUE)
java_min_int = -2147483648
In [8]:
# Load the 2014-10-13 gender snapshot; java_min_int rows become NaN
allrecs = pd.read_csv('snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',na_values=[java_min_int])
In [3]:
def split_column(q_str):
    """Reduce a pipe-delimited QID cell to its first QID.

    NaN floats are passed through unchanged; any value that is neither a
    NaN float nor a str yields None (matching the original's implicit
    fall-through).
    """
    if type(q_str) is float:
        if numpy.isnan(q_str):
            return q_str  # propagate missing values untouched
        return None  # non-NaN floats produced None before as well
    if type(q_str) is str:
        # the stored format always ends with a trailing '|'
        return q_str.split('|')[0]
In [4]:
# Collapse each multi-valued QID column down to its first QID
for col in ['place_of_birth','gender', 'citizenship','ethnic_group']:
    allrecs[col] = allrecs[col].apply(split_column)
In [5]:
# Sanity-check the split: each QID column should now hold a single QID
allrecs.head(5)
Out[5]:
In [6]:
# Aggregation maps: QID -> country list / culture name (hand- and Turk-built)
pobs_map = json.load(open('helpers/aggregation_maps/pobs_map.json','r'))
# NOTE(review): DataFrame.from_csv is removed in modern pandas; the
# equivalent is pd.read_csv('...', index_col=0)
country_map = pd.DataFrame.from_csv('helpers/aggregation_maps/country_maps.csv')
ethnic_group_map = json.load(open('helpers/aggregation_maps/mechanical_turk/ethnic_groups_map.json','r'))
citizenship_map = json.load(open('helpers/aggregation_maps/mechanical_turk/citizenship_map.json','r'))
def map_pob(qid):
    """Map a place-of-birth QID to a culture name.

    Looks the QID up in pobs_map, takes the first listed country
    (assumption carried over from the original), then reads its
    'culture_name' from country_map.  Returns None for non-str input or
    an empty country list.
    """
    if type(qid) is not str:
        return None
    country_list = pobs_map[qid]
    if not country_list:
        return None
    country = country_list[0]
    return country_map.ix[country]['culture_name']
def map_wrapper(m):
    """Build a lookup function over mapping *m* that yields None for
    unknown keys instead of raising KeyError."""
    def lookup(qid):
        try:
            value = m[qid]
        except KeyError:
            return None
        return value
    return lookup
mismatch = pd.DataFrame()  # filled by determine_culture when two sources disagree
#order is important because it determines the preference we will use
# BUG FIX: materialise the pairs with list().  On Python 3 a bare zip()
# is a one-shot iterator, so it would be exhausted after the first row
# processed by determine_culture and every later row would see no mappers.
col_map_fun = list(zip(['ethnic_group', 'citizenship', 'place_of_birth'],
                       [map_wrapper(ethnic_group_map), map_wrapper(citizenship_map), map_pob]))
def determine_culture(row):
    """Resolve a single culture label for a row.

    Tries each (column, mapper) pair in col_map_fun order; the last
    non-falsy guess wins.  When two non-None guesses disagree, the row
    is recorded in the module-level `mismatch` DataFrame.

    Returns the culture name lowercased, or None when nothing matched.
    """
    global mismatch  # reassigned below when a disagreement is logged
    culture = None
    for col, map_fun in col_map_fun:
        guess = map_fun(row[col])
        if (culture is not None) and (guess is not None):
            if culture != guess:
                # BUG FIX: DataFrame.append returns a *new* frame; the
                # original discarded it, so `mismatch` always stayed empty.
                # (pandas >= 2.0 would need pd.concat instead of append.)
                mismatch = mismatch.append(row, ignore_index=True)
        if guess:
            culture = guess
    return str(culture).lower() if culture else culture  # None passes through
In [173]:
%%timeit -r 1 -n 1
# Benchmark culture resolution on the first 2,500 rows (single run)
allrecs.iloc[0:2500].apply(lambda x: determine_culture(x), axis=1)
In [174]:
%%timeit -r 1 -n 1
# Benchmark culture resolution on the first 25,000 rows (single run)
allrecs.iloc[0:25000].apply(lambda x: determine_culture(x), axis=1)
In [17]:
# Resolve a culture for every record (the lambda is redundant;
# .apply(determine_culture, axis=1) would be equivalent)
allrecs['culture'] = allrecs.apply(lambda x: determine_culture(x), axis=1)
In [195]:
# Python 2 print statement; shows rows where the sources disagreed on culture
print mismatch
In [176]:
# Cache the enriched frame so later sessions can skip the expensive apply
allrecs.to_json('helpers/world_cultures_shortcut.json')
In [5]:
# Shortcut: reload the cached frame written by the cell above
allrecs = pd.DataFrame.from_dict(json.load(open('helpers/world_cultures_shortcut.json','r')))
In [201]:
# NOTE(review): rank_compare is never defined in this notebook — hidden
# state from another session; this cell fails on a fresh kernel.
import scipy.stats
scipy.stats.spearmanr(rank_compare[['Rank','Rank_wikidata']])
Out[201]:
In [207]:
# Non-parametric comparisons of the two rank columns (same undefined
# rank_compare as above — requires out-of-notebook state)
scipy.stats.mannwhitneyu(rank_compare['Rank'],rank_compare['Rank_wikidata'])
Out[207]:
In [208]:
scipy.stats.ranksums(rank_compare['Rank'],rank_compare['Rank_wikidata'])
Out[208]:
In [205]:
# Python 2 print statement; dumps the comparison table as raw HTML
print rank_compare.to_html()
Quite uncorrelated. That means that either the data is not good, or that the World Economic Forum's methods have little to do with the percentage of women born in those countries as recorded semantically on a historic level. And $\rho$ is high.
In [6]:
# Re-load the country -> culture lookup (same call as the earlier cell)
country_map = pd.DataFrame.from_csv('helpers/aggregation_maps/country_maps.csv')
In [7]:
def map_culture(qid):
    # NOTE(review): byte-for-byte duplicate of map_pob defined earlier in
    # the notebook; despite the generic name it still looks keys up in
    # pobs_map, so it only makes sense for place-of-birth QIDs.
    if not type(qid) is str:
        return None
    else:
        country_list = pobs_map[qid]
        if len(country_list) == 0:
            return None
        else:
            country = country_list[0] #assumption: first listed country wins
            # .ix is removed in modern pandas; .loc is the equivalent here
            culture = country_map.ix[country]['culture_name']
            return culture
In [15]:
# Culture derived from place of birth only (unlike determine_culture,
# which votes across three columns)
allrecs['culture'] = allrecs['place_of_birth'].apply(map_culture)
In [32]:
import math
import pywikibot
# Transforming QIDs into English labels via the Wikidata repository
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()
# memo cache: qid -> English label (or the qid itself when no label exists)
retrieved = dict()
def english_label(qid):
    """Fetch the English Wikidata label for a QID, memoised in `retrieved`.

    Returns None for falsy or NaN input; returns the QID itself (and
    caches that fallback) when the item has no English label.
    NOTE(review): a non-NaN float qid falls through to ItemPage —
    presumably filtered out upstream; confirm.
    """
    if qid:
        if type(qid) is float:
            if math.isnan(qid):
                return None
        # first see if we've already fetched this QID
        try:
            return retrieved[qid]
        except KeyError:
            try:
                page = pywikibot.ItemPage(wikidata, qid)
                data = page.get()  # network call to Wikidata
                lab = data['labels']['en']
                retrieved[qid] = lab
                return lab
            except KeyError:
                # no English label: cache and return the raw QID
                retrieved[qid] = qid
                return qid
    else:
        return None
In [33]:
# Spot-check the label lookup on a gender QID
english_label('Q6581097')
Out[33]:
In [34]:
# Resolve every gender QID to its English label (network-bound; memoised)
allrecs['gender_name'] = allrecs['gender'].apply(english_label)
In [ ]:
# Keep just the two columns needed for the chi-squared test
outdf = allrecs[['gender_name','culture']]
In [ ]:
outdf.to_csv('helpers/Chi_Squared_Test_Data.csv')
How many records have gender, place of birth, and date of birth?
In [9]:
def _has_value(x):
    """True when x is an actual value: not a float NaN and not None."""
    if isinstance(x, float):
        return not math.isnan(x)
    return x is not None

# Per-column presence counts and percentages over the whole frame
has = defaultdict(dict)
for col in allrecs.columns:
    present_mask = allrecs[col].apply(_has_value)
    nonempty = len(allrecs[present_mask])
    has[col]['Items with property'] = nonempty
    has[col]['% of total'] = nonempty / float(len(allrecs))
hasdf = pd.DataFrame.from_dict(has, orient='index')
In [10]:
# Python 2 print; DataFrame.sort here is the old API (sort_values today)
print hasdf.sort('% of total').to_html(justify='right', formatters={'% of total':lambda x: '%.2f' % (x*100),
                                       'Items with property':lambda x: '{0:,}'.format(x)})
In [11]:
# Successive filters: keep records having dob, then gender, then culture
hasdob = allrecs[allrecs['dob'].apply(lambda x: not math.isnan(x))]
len(hasdob)
Out[11]:
In [12]:
# QID strings pass the type check; float NaNs are dropped
hasgender = hasdob[hasdob['gender'].apply(lambda x: not math.isnan(x) if type(x) is float else True)]
len(hasgender)
Out[12]:
In [13]:
hascult = hasgender[hasgender['culture'].apply(lambda x: x is not None)]
len(hascult)
Out[13]:
In [14]:
hascult.head()
Out[14]:
In [15]:
# One group per culture label for the per-culture time series below
culture_groups = hascult.groupby('culture')
In [16]:
def make_perc_series(df):
    """Per-birth-year fraction of non-male records.

    For each distinct 'dob' value, divides the count of rows whose
    gender QID is not Q6581097 (male) by the count of rows with a
    non-null gender — exactly what the original explicit loop computed
    (the author's own comment asked for this vectorisation).
    """
    grouped = df.groupby('dob')['gender']
    # .count() ignores nulls in both numerator and denominator, matching
    # the original's .count() semantics per year
    nonmale = grouped.apply(lambda g: g[g != 'Q6581097'].count())
    total = grouped.count()
    # astype(float) forces true division; pd.TimeSeries is removed from
    # modern pandas and the groupby result is already a Series by year
    perc_series = nonmale / total.astype(float)
    return perc_series
# Build {culture: yearly non-male fraction Series}
perc_dict = dict()
for name, group in culture_groups:
    perc_series = make_perc_series(group)
    perc_dict[name] = perc_series
In [34]:
# NOTE(review): out-of-order execution — perc_df is defined in the *next*
# cell (In [35]); this display only works after running that cell first.
perc_df.tail(10)
Out[34]:
In [35]:
perc_df = pd.DataFrame.from_dict(perc_dict)
# NOTE(review): int(200/6.0) == 33, so these sample years step 1800, 1833, ...
years = range(1800,2000,int(200/6.0))
subbd_df = perc_df.ix[years]  # .ix is removed in modern pandas (.loc)
infogram = subbd_df
infogram.to_csv('Magnus Gender analysis/infogram_pob_dob_cult.csv',index=True, encoding='utf-8')
In [37]:
# Two panels: full history (100-yr rolling mean) and modern era (10-yr)
fig, (full, modern) = plt.subplots(1, 2, figsize=(20,6))
end_year = 2000
for start_year, ra_len, ax in zip((-1000, 1800), (100, 10), (full, modern)):
    ra_dict = dict()
    # .items() works on both Python 2 and 3 (.iteritems() is py2-only);
    # pd.rolling_mean matches the pandas version this notebook targets
    for name, series in perc_dict.items():
        ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=10)
    cult_dob_per = pd.DataFrame(ra_dict)
    if start_year == 1800:
        year_list = range(1900, end_year, 10)
        # BUG FIX: the original built year_list but then indexed with the
        # stale module-level `years` (1800..2000 step 33) from an earlier cell
        cult_dob_per.ix[year_list].to_csv('Magnus Gender analysis/infogram_pob_dob_cult.csv',index=True, encoding='utf-8')
    cult_dob_per.plot(cmap='Paired', linewidth=2, ax=ax, legend=False, zorder=-ra_len)
    ax.set_xlim((start_year, end_year))
    # // keeps integer steps for range() on Python 3 as well
    ax.set_xticks(range(start_year, end_year, (end_year - start_year) // 16))
    ax.set_ylim((0, 0.6))
    ax.set_title(u'{}—{}, with {} year Rolling Average'.format(start_year, end_year, ra_len))
    ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x)))
# BUG FIX: the original did `full.legend = legend(...)`, which draws the
# legend on the *current* axes (modern) and clobbers full's legend method
full.legend(bbox_to_anchor=(0.05, 0.95), loc=2, borderaxespad=0)
full.set_xticks(range(-1000, end_year, (end_year + 1000) // 15))
fig.suptitle('Female % of Biographies by Culture, over Time', fontsize=24)
fig.subplots_adjust(top=0.88)
In [27]:
# NOTE(review): near-duplicate of the previous plotting cell (stacked
# instead of side-by-side) — a shared plotting function would remove the copy
fig, (full, modern) = plt.subplots(2,1, figsize=(12,8), sharex=False)
end_year = 2000
for start_year, ra_len, ax in zip((-1000,1800), (100,10), (full, modern)):
    ra_dict = dict()
    for name, series in perc_dict.iteritems():
        # pd.rolling_mean is removed in modern pandas (Series.rolling().mean())
        ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=10)
    cult_dob_per = pd.DataFrame(ra_dict)
    cult_dob_per.plot(cmap='Paired', linewidth=2, ax=ax, legend=False,zorder=-ra_len)
    ax.set_xlim((start_year, end_year))
    # integer division intended (Python 2); use // under Python 3
    ax.set_xticks(range(start_year, end_year,(end_year-start_year) / 16))
    ax.set_ylim((0,0.6))
    ax.set_title(u'{}—{}, with {} year Rolling Average'.format(start_year, end_year,ra_len))
    ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x )))
# NOTE(review): pylab's legend() acts on the *current* axes and the
# assignment clobbers full's legend method — full.legend(...) was presumably intended
full.legend = legend(bbox_to_anchor=(0.05, 0.95), loc=2, borderaxespad=0)
#full.set_xticks(range(-1000, end_year,(end_year+1000) / 15))
fig.suptitle('Female % of Biographies by Culture, over Time', fontsize=24)
fig.subplots_adjust(top=0.88)
In [184]:
# Records with both a date of birth and a resolved culture
dobexists = allrecs[allrecs['dob'].apply(lambda x: not math.isnan(x))]
dobcultureexists = dobexists[dobexists['culture'].apply(lambda x: x is not None)]
len(dobcultureexists)
Out[184]:
In [185]:
culture_groups = dobcultureexists[['dob','culture']].groupby(by='culture')
In [186]:
def make_tot_series(df):
    """Number of records per birth year.

    Equivalent to the original per-year loop: for each 'dob' group,
    count the non-null 'culture' entries.  Returns a Series indexed by
    year (pd.TimeSeries no longer exists in pandas; the fixed-up
    vectorisation the original comment asked for).
    """
    return df.groupby('dob')['culture'].count()
# Build {culture: yearly biography-count Series}
tot_dict = dict()
for name, group in culture_groups:
    tot_dict[name] = make_tot_series(group)
In [189]:
# Total biographies per year, several start years and rolling windows
end_year = 2014
for start_year in [1500, 1800]:
    for ra_len in [2, 5, 10]:
        ra_dict = dict()
        # .items() works under Python 2 and 3 (.iteritems() is py2-only)
        for name, series in tot_dict.items():
            ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=1)
        cult_dob = pd.DataFrame(ra_dict)
        # BUG FIX: the original bound this Axes to the name `plt`,
        # clobbering the pyplot module injected by %pylab for later cells
        ax = cult_dob.plot(figsize=(20,6), cmap='Set2', linewidth=1.5)
        ax.set_xlim((start_year, end_year))
        # // keeps the range() step an int under Python 3 too
        ax.set_xticks(range(start_year, end_year, (end_year - start_year) // 15))
        ax.set_title('Total Biographies by Date of Birth | %s Year Rolling Average' % str(ra_len))
        ax.legend(loc=2)
In [188]:
# Same totals plot for the ancient eras, log-scaled
for start_year, end_year in zip([-2000, -1000], [1000, 1500]):
    for ra_len in [1, 2, 10]:
        ra_dict = dict()
        # .items() works under Python 2 and 3 (.iteritems() is py2-only)
        for name, series in tot_dict.items():
            ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=1)
        cult_dob = pd.DataFrame(ra_dict)
        # BUG FIX: the original bound this Axes to the name `plt`,
        # clobbering the pyplot module injected by %pylab for later cells
        ax = cult_dob.plot(figsize=(20,6), cmap='Set2', linewidth=1.5)
        ax.set_ylim((0, 50))
        ax.set_yscale('log')
        ax.set_xlim((start_year, end_year))
        # // keeps the range() step an int under Python 3 too
        ax.set_xticks(range(start_year, end_year, (end_year - start_year) // 15))
        ax.set_title('Total Biographies by Date of Birth | %s Year Rolling Average' % str(ra_len))
        ax.legend(loc=2)
In [ ]: